1 Libraries

library(httr)
library(data.table)
library(dplyr)
library(lubridate)
library(knitr)
library(highcharter)
library(DT)
library(caret)
library(tibble) 
library(rsample)   
library(jtools)



2 Daten importieren und vorbereiten

2.1 Meteorologische Daten

fread("https://data.geo.admin.ch/ch.meteoschweiz.klima/nbcn-tageswerte/nbcn-daily_BAS_previous.csv", sep = ";", colClasses = c("character", "Date", rep("numeric", 10))) %>%
  mutate(
    timestamp = as.POSIXct(
      paste0(substr(date, 1, 4), "-", substr(date, 5, 6), "-", substr(date, 7, 8), " 00:00:00"),
      format="%Y-%m-%d %H:%M:%S"
    )
  ) %>%
  mutate_at(c("gre000d0", "hto000d0", "nto000d0", "prestad0", "rre150d0", "sre000d0", "tre200d0", "tre200dn", "tre200dx", "ure200d0"), as.numeric) %>%
  bind_rows(
    fread("https://data.geo.admin.ch/ch.meteoschweiz.klima/nbcn-tageswerte/nbcn-daily_BAS_current.csv", sep = ";", colClasses = c("character", "Date", rep("numeric", 10))) %>%
      mutate(
        timestamp = as.POSIXct(
          paste0( substr(date, 1, 4), "-", substr(date, 5, 6), "-", substr(date, 7, 8), " 00:00:00"),
          format="%Y-%m-%d  %H:%M:%S", tz="Europe/Berlin"
        )
      ) %>%
      mutate_at(c("gre000d0", "hto000d0", "nto000d0", "prestad0", "rre150d0", "sre000d0", "tre200d0", "tre200dn", "tre200dx", "ure200d0"), as.numeric)
  ) %>%
  relocate(timestamp) %>%
  select(-c(date, `station/location`)) %>%
  mutate(
    year = as.numeric(substr(timestamp, 1, 4)),
    month = as.numeric(substr(timestamp, 6, 7)),
    day = as.numeric(substr(timestamp, 9, 10))
  ) %>%
  data.frame() %>%
  assign("meteo", ., inherits = TRUE)



2.2 Feiertage, Ferien und Veranstaltungen

httr::GET("https://data.bs.ch/explore/dataset/100074/download/?format=csv&timezone=Europe%2FBerlin") %>%
  content(., "text") %>%
  fread(sep=";") %>%
  select(tag_datum, name, code, kategorie_name) %>%
  filter(name != "Fasnachtsmontag", name != "Fasnachtsmittwoch", name != "Dies Academicus") %>%
  filter(!(tag_datum == "2008-05-01 00:00:00" & name == "Tag der Arbeit")) %>% # Tag der Arbeit doubles with Auffahrt
  filter(kategorie_name %in% c("Feiertag", "Ferien") | code == "herbstm") %>%
  filter(name != "Semesterferien") %>%
  assign("rd_veranst", ., inherits = TRUE)

rd_veranst %>%
  data.frame() %>%
  mutate(Herbstmesse = if_else(code == "herbstm", "Herbstmesse", "")) %>%
  select(tag_datum, Herbstmesse) %>%
  filter(Herbstmesse != "") %>%
  full_join(
    rd_veranst %>%
      mutate(Feiertage = if_else(kategorie_name == "Feiertag", name, "")) %>%
      select(tag_datum, Feiertage) %>%
      filter(Feiertage != ""),
    by = "tag_datum"
  ) %>%
  full_join(
    rd_veranst %>%
      mutate(Ferien = if_else(kategorie_name == "Ferien", name, "")) %>%
      select(tag_datum, Ferien) %>%
      filter(Ferien != ""),
    by = "tag_datum"
  ) %>%
  mutate(timestamp = as.POSIXct(tag_datum, tz="Europe/Berlin")) %>%
  filter(year(timestamp) > 2011) %>%
  select(timestamp, Herbstmesse, Feiertage, Ferien) %>%
  mutate(
    year = lubridate::year(timestamp), 
    month = lubridate::month(lubridate::floor_date(timestamp, "month")),
    day = lubridate::day(lubridate::floor_date(timestamp, "day"))
  ) %>%
  data.frame() %>%
  assign("Veranstaltungen", ., inherits = TRUE)

rm(events, rd_veranst)



2.3 Gasverbrauchs-Daten

httr::GET("https://data.bs.ch/explore/dataset/100304/download/?format=csv&timezone=Europe%2FBerlin")  %>%
  content(., "text") %>%
  fread(sep=";") %>%
  group_by(year, month, day) %>%
  filter(year > 2011) %>%
  summarise(gasverbrauch = sum(value, na.rm = T)) %>%
  ungroup() %>%
  mutate(timestamp = as.POSIXct(
    paste0(year, "-", month, "-", day, " 00:00:00"), format="%Y-%m-%d  %H:%M:%S", tz="Europe/Berlin"
    )
  ) %>%
  mutate(gasverbrauch=gasverbrauch/1000000) %>%
  relocate(timestamp) %>%
  assign("gas_daily", ., inherits = TRUE)



2.4 Datensätze zusammenfügen

meteo %>%
  full_join(Veranstaltungen %>%
              select(-timestamp), by = c("year" = "year", "month" = "month", "day" = "day")) %>%
  full_join(gas_daily %>%
              select(-timestamp), by = c("year" = "year", "month" = "month", "day" = "day")) %>%
    mutate(weekday = lubridate::wday(as.Date(timestamp), label = TRUE, abbr = TRUE),
         daytype = if_else(weekday %in% c(1,7), "Wochenende", "Werktage")) %>%
  relocate(timestamp, gasverbrauch) %>%
  filter(!is.na(gasverbrauch)) %>%
  mutate(HGT = if_else(tre200d0 <= 12, 20-tre200d0, 0)) %>%
  mutate(Herbstmesse = if_else(is.na(Herbstmesse), "No", Herbstmesse),
         Feiertage = if_else(is.na(Feiertage), "No", Feiertage),
         Ferien = if_else(is.na(Ferien), "No", Ferien)) %>%
  mutate(time = as.Date(timestamp, tz = 'Europe/Berlin')) %>%
  select(-timestamp) %>%
  relocate(time) %>%
  slice(1:(n() - 1)) %>%              
  assign("Data", ., inherits = TRUE)

# Variable für Energiespar-Kampagnen
start_date= as.Date("2022-09-01")
end_date= as.Date("2023-01-31")

Data %>%
  mutate(time = as.Date(time),
         Feiertage_dummy = if_else(Feiertage == "No", 0, 1),
         Ferien_dummy = if_else(Ferien == "No", 0, 1),
         Herbstmesse_dummy = if_else(Herbstmesse == "No", 0, 1),
         Wochenende_dummy = if_else(daytype == "Werktage", 0, 1),
         Energiespar_Kampagnen = ifelse(as.Date(time) %within% interval(start_date, end_date), 1,0),
         year_month = paste(year, 
                            formatC(month, width = 2, format = "d", flag = "0"),
                            sep="-"),
         Energiespar_Kampagnen_monatlich = ifelse(Energiespar_Kampagnen == 1, as.character(year_month), "0"),
         weekday = factor(weekday, ordered = FALSE)
  ) %>%
  slice(1:(n() - 1)) %>% # Unvollständigen Tag für den Gasverbrauch entfernen.
  assign("Data_selec", ., inherits = TRUE)



3 OLS Regression

Data_selec_model <- Data_selec %>%
  filter(time < as.Date("2024-10-31"))

set.seed(12345)

inTraining <- createDataPartition(Data_selec_model$gasverbrauch, p = .7, list = FALSE)
training <- Data_selec_model[ inTraining,]
testing  <- Data_selec_model[-inTraining,]

fitControl <- trainControl(method = "repeatedcv",
                           number = 10,
                           repeats = 10)

set.seed(54321)
ols_Fit1 <- train(gasverbrauch ~ time + as.factor(month) + weekday +
                  as.factor(Ferien_dummy) + as.factor(Feiertage_dummy) + as.factor(Herbstmesse_dummy) +
                  gre000d0 + hto000d0 + nto000d0 + prestad0 + rre150d0 + 
                  sre000d0 +
                  tre200d0 + tre200dn + tre200dx + ure200d0 +
                  Energiespar_Kampagnen +
                  HGT +
                  I(HGT^2) + I(tre200d0^2),
                  data = training, 
                  method = "lm",
                  trControl = fitControl,
                  verbose = TRUE)
# ols_Fit1
# ols_Fit1$resample
# summary(ols_Fit1)

3.1 Leistung cross-validation

ols_Fit1$results %>% 
  round(digits = 3) -> ols_Fit1$results


data.frame(
  RMSE = paste0(ols_Fit1$results[2], " &pm; ", ols_Fit1$results[5]),
  Rsquared = paste0(ols_Fit1$results[3], " &pm; ", ols_Fit1$results[6]),
  MAE = paste0(ols_Fit1$results[4], " &pm; ", ols_Fit1$results[7])
) %>%
  kable()
RMSE Rsquared MAE
0.912 ± 0.072 0.966 ± 0.006 0.701 ± 0.056

3.2 Leistung Test

testing %>%
  mutate(pred = predict(ols_Fit1, newdata = testing)) %>%
  select(obs = gasverbrauch, pred) %>%
  defaultSummary() %>%
  round(digits = 3) -> performance_test

data.frame(
  RMSE = performance_test[1],
  Rsquared = performance_test[2],
  MAE = performance_test[3]
) %>%
  kable(row.names = F, align = "lll") 
RMSE Rsquared MAE
0.974 0.959 0.741

3.3 Endgültiges Modell

final_model <- lm(gasverbrauch ~ time + as.factor(month) + weekday +
                  as.factor(Ferien_dummy) + as.factor(Feiertage_dummy) + as.factor(Herbstmesse_dummy) +
                  gre000d0 + hto000d0 + 
                  # nto000d0 + prestad0 + rre150d0 + sre000d0 + 
                  tre200d0 + tre200dn + 
                  # tre200dx + ure200d0 +
                  Energiespar_Kampagnen +
                  # HGT + I(HGT^2) + 
                  I(tre200d0^2),
                  data = Data_selec_model)


summ(final_model,
     model.info = T,
     model.fit = F,
     digits = getOption("jtools-digits", 3),
     stars = T,
     robust=T
)
## MODEL INFO:
## Observations: 1156
## Dependent Variable: gasverbrauch
## Type: OLS linear regression 
## 
## Standard errors: Robust, type = HC3
## ----------------------------------------------------------------------
##                                         Est.    S.E.    t val.       p
## ----------------------------------- -------- ------- --------- -------
## (Intercept)                           51.680   1.801    28.701   0.000
## time                                  -0.002   0.000   -18.924   0.000
## as.factor(month)2                     -0.620   0.176    -3.524   0.000
## as.factor(month)3                     -1.292   0.173    -7.452   0.000
## as.factor(month)4                     -2.776   0.197   -14.055   0.000
## as.factor(month)5                     -3.564   0.198   -18.033   0.000
## as.factor(month)6                     -3.401   0.213   -15.999   0.000
## as.factor(month)7                     -3.073   0.218   -14.102   0.000
## as.factor(month)8                     -3.529   0.207   -17.061   0.000
## as.factor(month)9                     -3.407   0.190   -17.897   0.000
## as.factor(month)10                    -2.639   0.165   -15.989   0.000
## as.factor(month)11                    -1.581   0.188    -8.414   0.000
## as.factor(month)12                     0.028   0.194     0.146   0.884
## weekdayMo                              0.009   0.105     0.089   0.929
## weekdayDi                             -0.022   0.104    -0.209   0.835
## weekdayMi                             -0.051   0.098    -0.524   0.601
## weekdayDo                             -0.150   0.097    -1.545   0.123
## weekdayFr                             -0.650   0.103    -6.303   0.000
## weekdaySa                             -0.685   0.100    -6.869   0.000
## as.factor(Ferien_dummy)1              -0.430   0.080    -5.362   0.000
## as.factor(Feiertage_dummy)1           -0.474   0.156    -3.035   0.002
## as.factor(Herbstmesse_dummy)1         -0.720   0.153    -4.720   0.000
## gre000d0                              -0.002   0.001    -3.611   0.000
## hto000d0                              -0.284   0.076    -3.760   0.000
## tre200d0                              -0.872   0.030   -28.645   0.000
## tre200dn                              -0.067   0.022    -3.093   0.002
## Energiespar_Kampagnen                 -1.709   0.109   -15.688   0.000
## I(tre200d0^2)                          0.019   0.001    24.995   0.000
## ----------------------------------------------------------------------
RMSE Rsquared MAE
0.902 0.966 0.695



Data_selec %>%
  bind_cols(
    predict(final_model, Data_selec, interval = "prediction")
  ) %>%
  slice(1:(n() - 1)) -> Data_model_ols

highchart(type = "stock") %>%
  hc_add_series(Data_model_ols, "line", hcaes(time, gasverbrauch), color = "#008AC3",
                tooltip = list(pointFormat = "Effektiver Gasverbrauch: {point.gasverbrauch:.2f} GWh",
                               shared = TRUE),
                zIndex = 1) %>%
  hc_xAxis(title = list(text = "")) %>%
  hc_add_series(Data_model_ols, "line", hcaes(time, fit), color = "#B375AB",
                tooltip = list(pointFormat = "Erwarteter Gasverbrauch: {point.fit:.2f} GWh",
                               shared = TRUE),
                zIndex = 2) %>%
  hc_add_series(Data_model_ols, type = "arearange",
                hcaes(x = time, low = lwr, high = upr),
                zIndex = 0,
                color = "#E7CEE2",
                tooltip = list(pointFormat = "95% Konfidenzintervall: {point.lwr:.2f} - {point.upr:.2f} GWh"), shared = TRUE
  ) %>%
  hc_yAxis(floor=0, title = list(text = ""), opposite = FALSE) %>%
  hc_plotOptions(series = list(marker = list(enabled = FALSE))) %>%
  hc_rangeSelector(selected = 0)
write.csv2(Data_model_ols %>%
             mutate(vgl_real_minus_forecast = gasverbrauch - fit) %>%
             select(
               time,
               gasverbrauch,
               forecast = fit,
               vgl_real_minus_forecast,
               forecast_lowFI = lwr,
               forecast_highFI = upr
             ),
           "100353_gasverbrauch.csv", row.names=F, na = "")




Hinweis: Je nach Kombination von Betriebssystemen und Versionen von RStudio, R und den verwendeten Pakete können die Ergebnisse leicht von den publizierten Resultaten abweichen. Die angewendete Konfiguration lautet:


sessionInfo()
## R version 4.3.3 (2024-02-29 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=German_Switzerland.utf8  LC_CTYPE=German_Switzerland.utf8   
## [3] LC_MONETARY=German_Switzerland.utf8 LC_NUMERIC=C                       
## [5] LC_TIME=German_Switzerland.utf8    
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] jtools_2.3.0      rsample_1.2.1     tibble_3.2.1      caret_7.0-1      
##  [5] lattice_0.22-5    ggplot2_3.5.1     DT_0.33           highcharter_0.9.4
##  [9] knitr_1.50        lubridate_1.9.4   dplyr_1.1.4       data.table_1.17.0
## [13] httr_1.4.7       
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     timeDate_4041.110    fastmap_1.2.0       
##  [4] pROC_1.18.5          digest_0.6.37        rpart_4.1.23        
##  [7] timechange_0.3.0     lifecycle_1.0.4      survival_3.5-8      
## [10] magrittr_2.0.3       compiler_4.3.3       rlang_1.1.5         
## [13] sass_0.4.9           tools_4.3.3          igraph_2.1.4        
## [16] yaml_2.3.10          htmlwidgets_1.6.4    curl_6.2.0          
## [19] plyr_1.8.9           TTR_0.24.4           withr_3.0.2         
## [22] purrr_1.0.2          stats4_4.3.3         nnet_7.3-19         
## [25] grid_4.3.3           xts_0.14.1           colorspace_2.1-1    
## [28] future_1.34.0        globals_0.16.3       scales_1.3.0        
## [31] iterators_1.0.14     MASS_7.3-60.0.1      cli_3.6.3           
## [34] rmarkdown_2.29       generics_0.1.3       rlist_0.4.6.2       
## [37] rstudioapi_0.17.1    future.apply_1.11.3  reshape2_1.4.4      
## [40] cachem_1.1.0         pander_0.6.6         stringr_1.5.1       
## [43] splines_4.3.3        assertthat_0.2.1     parallel_4.3.3      
## [46] vctrs_0.6.5          sandwich_3.1-1       hardhat_1.4.1       
## [49] Matrix_1.6-5         jsonlite_1.8.9       bookdown_0.42       
## [52] listenv_0.9.1        foreach_1.5.2        gower_1.0.2         
## [55] tidyr_1.3.1          jquerylib_0.1.4      recipes_1.1.1       
## [58] quantmod_0.4.26      glue_1.8.0           parallelly_1.42.0   
## [61] codetools_0.2-19     stringi_1.8.4        gtable_0.3.6        
## [64] broom.mixed_0.2.9.6  munsell_0.5.1        furrr_0.3.1         
## [67] pillar_1.10.1        htmltools_0.5.8.1    ipred_0.9-15        
## [70] lava_1.8.1           R6_2.5.1             evaluate_1.0.3      
## [73] backports_1.5.0      broom_1.0.7          bslib_0.9.0         
## [76] class_7.3-22         Rcpp_1.0.14          nlme_3.1-164        
## [79] prodlim_2024.06.25   xfun_0.51            forcats_1.0.0       
## [82] zoo_1.8-13           pkgconfig_2.0.3      ModelMetrics_1.2.2.2


Gasverbrauch im Versorgungsgebiet der IWB: https://data.bs.ch/explore/dataset/100304/

Effektiver und erwarteter täglicher Gasverbrauch: https://data.bs.ch/explore/dataset/100353/

Der Code des Modells kann selber ausgeführt und weiterentwickelt werden. Hierfür wird Renku verwendet. Renku ist eine Plattform, die verschiedene Werkzeuge für reproduzierbare und kollaborative Datenanalyseprojekte bündelt: https://renkulab.io/projects/stata/reproducible/erwarteter-gasverbrauch-basel-stadt

Webartikel: https://www.statistik.bs.ch/aktuell/gasverbrauch-2023.html